library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(here)
## here() starts at /Users/itsjoeoui/Developer/McGill/MATH208
# Load the HTRU2 data. The file is read without a header row
# (col_names = FALSE), so readr assigns the placeholder names X1..X9.
HTRU2 <- read_csv(here("HTRU2/HTRU_2.csv"), col_names = FALSE)
## Rows: 17898 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): X1, X2, X3, X4, X5, X6, X7, X8, X9
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Replace the placeholder column names with descriptive ones, then preview
# the first rows.
names(HTRU2) <- c(
  "Mean_IP", "SD_IP", "EK_IP", "SKW_IP",
  "Mean_DMSNR", "SD_DMSNR", "EK_DMSNR", "SKW_DMSNR",
  "Class"
)
head(HTRU2)
## # A tibble: 6 × 9
##   Mean_IP SD_IP   EK_IP SKW_IP Mean_DMSNR SD_DMSNR EK_DMSNR SKW_DMSNR Class
##     <dbl> <dbl>   <dbl>  <dbl>      <dbl>    <dbl>    <dbl>     <dbl> <dbl>
## 1   141.   55.7 -0.235  -0.700       3.20     19.1     7.98      74.2     0
## 2   103.   58.9  0.465  -0.515       1.68     14.9    10.6      127.      0
## 3   103.   39.3  0.323   1.05        3.12     21.7     7.74      63.2     0
## 4   137.   57.2 -0.0684 -0.636       3.64     21.0     6.90      53.6     0
## 5    88.7  40.7  0.601   1.12        1.18     11.5    14.3      253.      0
## 6    93.6  46.7  0.532   0.417       1.64     14.5    10.6      131.      0
# Turn the numeric class indicator into readable labels: 0 becomes
# "Negative", every other non-missing value becomes "Positive".
HTRU2 <- HTRU2 %>%
  mutate(
    Class = ifelse(Class == 0, "Negative", "Positive")
  )
# Location and spread summaries of Mean_IP over the whole data set.
# The median column is labelled "Median" (it was misspelled "Medium").
# Backticks are used for the non-syntactic percentile column names.
HTRU2 %>%
  summarise(
    Average = mean(Mean_IP),
    Median = median(Mean_IP),
    `25%ile` = quantile(Mean_IP, 0.25),
    `75%ile` = quantile(Mean_IP, 0.75),
    StD = sd(Mean_IP),
    IQR = IQR(Mean_IP)
  )
## # A tibble: 1 × 6
##   Average Median `25%ile` `75%ile`   StD   IQR
##     <dbl>  <dbl>    <dbl>    <dbl> <dbl> <dbl>
## 1    111.   115.     101.     127.  25.7  26.2
# The same location and spread summaries of Mean_IP, computed per class.
# The median column is labelled "Median" (it was misspelled "Medium").
HTRU2 %>%
  group_by(Class) %>%
  summarise(
    Average = mean(Mean_IP),
    Median = median(Mean_IP),
    `25%ile` = quantile(Mean_IP, 0.25),
    `75%ile` = quantile(Mean_IP, 0.75),
    StD = sd(Mean_IP),
    IQR = IQR(Mean_IP)
  )
## # A tibble: 2 × 7
##   Class    Average Median `25%ile` `75%ile`   StD   IQR
##   <chr>      <dbl>  <dbl>    <dbl>    <dbl> <dbl> <dbl>
## 1 Negative   117.   117.     105.     128.   17.5  23.0
## 2 Positive    56.7   54.3     31.8     79.3  30.0  47.5
# Per-class mean and median of Mean_IP and Mean_DMSNR. Each result column is
# named <column>_<label>, with the label taken from the function list.
HTRU2 %>%
  select(Class, Mean_IP, Mean_DMSNR) %>%
  group_by(Class) %>%
  summarise_all(list(Avg = mean, Med = median))
## # A tibble: 2 × 5
##   Class    Mean_IP_Avg Mean_DMSNR_Avg Mean_IP_Med Mean_DMSNR_Med
##   <chr>          <dbl>          <dbl>       <dbl>          <dbl>
## 1 Negative       117.            8.86       117.            2.64
## 2 Positive        56.7          49.8         54.3          33.5
# Same per-class summaries, reshaped to one row per (Class, Measure) by
# listing the summary columns explicitly, then sorted by measure name in
# descending order.
HTRU2 %>%
  select(Class, Mean_IP, Mean_DMSNR) %>%
  group_by(Class) %>%
  summarise_all(list(Avg = mean, Med = median)) %>%
  pivot_longer(
    cols = c(Mean_IP_Avg, Mean_DMSNR_Avg, Mean_IP_Med, Mean_DMSNR_Med),
    names_to = "Measure"
  ) %>%
  arrange(desc(Measure))
## # A tibble: 8 × 3
##   Class    Measure         value
##   <chr>    <chr>           <dbl>
## 1 Negative Mean_IP_Med    117.  
## 2 Positive Mean_IP_Med     54.3 
## 3 Negative Mean_IP_Avg    117.  
## 4 Positive Mean_IP_Avg     56.7 
## 5 Negative Mean_DMSNR_Med   2.64
## 6 Positive Mean_DMSNR_Med  33.5 
## 7 Negative Mean_DMSNR_Avg   8.86
## 8 Positive Mean_DMSNR_Avg  49.8
# Same reshaping, but the summary columns are selected by their shared
# "Mean" prefix instead of being listed one by one.
HTRU2 %>%
  select(Class, Mean_IP, Mean_DMSNR) %>%
  group_by(Class) %>%
  summarise_all(list(Avg = mean, Med = median)) %>%
  pivot_longer(
    cols = starts_with("Mean"),
    names_to = "Measure"
  ) %>%
  arrange(desc(Measure))
## # A tibble: 8 × 3
##   Class    Measure         value
##   <chr>    <chr>           <dbl>
## 1 Negative Mean_IP_Med    117.  
## 2 Positive Mean_IP_Med     54.3 
## 3 Negative Mean_IP_Avg    117.  
## 4 Positive Mean_IP_Avg     56.7 
## 5 Negative Mean_DMSNR_Med   2.64
## 6 Positive Mean_DMSNR_Med  33.5 
## 7 Negative Mean_DMSNR_Avg   8.86
## 8 Positive Mean_DMSNR_Avg  49.8
# Per-class mean, median and quartiles of Mean_IP and Mean_DMSNR, reshaped
# so each measure is a row and each class is a column.
HTRU2 %>%
  select(Class, Mean_IP, Mean_DMSNR) %>%
  group_by(Class) %>%
  summarise_all(
    list(
      Avg = mean,
      Med = ~ median(.),
      Q25 = ~ quantile(., probs = 0.25),
      Q75 = ~ quantile(., 0.75)
    )
  ) %>%
  # Long form first: one row per (Class, Measure) with the number in `value`.
  pivot_longer(cols = starts_with("Mean"), names_to = "Measure") %>%
  # Then spread the classes into columns.
  pivot_wider(id_cols = Measure, names_from = Class) %>%
  arrange(desc(Measure))
## # A tibble: 8 × 3
##   Measure        Negative Positive
##   <chr>             <dbl>    <dbl>
## 1 Mean_IP_Q75      128.       79.3
## 2 Mean_IP_Q25      105.       31.8
## 3 Mean_IP_Med      117.       54.3
## 4 Mean_IP_Avg      117.       56.7
## 5 Mean_DMSNR_Q75     4.23     78.3
## 6 Mean_DMSNR_Q25     1.86     12.8
## 7 Mean_DMSNR_Med     2.64     33.5
## 8 Mean_DMSNR_Avg     8.86     49.8
# Correlation between Mean_IP and Mean_DMSNR, computed separately within
# each class (cor() uses the Pearson method by default).
HTRU2 %>% 
  group_by(Class) %>% 
  summarise(Cor_MeanIP_Mean_DMSNR = cor(Mean_IP, Mean_DMSNR))
## # A tibble: 2 × 2
##   Class    Cor_MeanIP_Mean_DMSNR
##   <chr>                    <dbl>
## 1 Negative                 0.117
## 2 Positive                -0.542
# Scatter plot of Mean_DMSNR against Mean_IP, one panel per class, with a
# least-squares line overlaid in black. The legend is dropped because the
# facet strips already identify the class.
# The axis label and title now spell the variable "DMSNR" (was "DMNSR").
ggplot(HTRU2, aes(x = Mean_IP, y = Mean_DMSNR, col = Class)) +
  geom_point() +
  facet_wrap(~Class) +
  geom_smooth(method = "lm", col = "black") +
  labs(x = "Mean IP", y = "Mean DMSNR", title = "Mean IP vs. Mean DMSNR") +
  theme(legend.position = "none")
## `geom_smooth()` using formula 'y ~ x'

# Negate Mean_DMSNR and recompute the per-class correlation with Mean_IP.
# Negating one variable flips the sign of the correlation and leaves its
# magnitude unchanged.
HTRU2 <- HTRU2 %>%
  mutate(Neg_MDMSNR = -Mean_DMSNR)
HTRU2 %>%
  group_by(Class) %>%
  summarise(Cor2 = cor(Mean_IP, Neg_MDMSNR))
## # A tibble: 2 × 2
##   Class      Cor2
##   <chr>     <dbl>
## 1 Negative -0.117
## 2 Positive  0.542
# Scatter plot of the negated Mean_DMSNR against Mean_IP, one panel per
# class, with a least-squares line in black.
# The y label and title now say "Negative Mean DMSNR": the plotted variable
# is Neg_MDMSNR, and the previous "Mean DMSNR" label did not say so.
ggplot(HTRU2, aes(x = Mean_IP, y = Neg_MDMSNR, col = Class)) +
  geom_point() +
  facet_wrap(~Class) +
  geom_smooth(method = "lm", col = "black") +
  labs(
    x = "Mean IP",
    y = "Negative Mean DMSNR",
    title = "Mean IP vs. Negative Mean DMSNR"
  ) +
  theme(legend.position = "none")
## `geom_smooth()` using formula 'y ~ x'

# Load the crime incident records. Column names come from the file's header
# row and column types are guessed by readr (see the specification printed
# below).
crime <- read_csv(here("BostonCrime/crime.csv"))
## Rows: 327820 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): INCIDENT_NUMBER, OFFENSE_CODE_GROUP, OFFENSE_DESCRIPTION, DISTRICT...
## dbl  (7): OFFENSE_CODE, REPORTING_AREA, YEAR, MONTH, HOUR, Lat, Long
## lgl  (1): SHOOTING
## dttm (1): OCCURRED_ON_DATE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of the crime data.
head(crime)
## # A tibble: 6 × 17
##   INCIDENT…¹ OFFEN…² OFFEN…³ OFFEN…⁴ DISTR…⁵ REPOR…⁶ SHOOT…⁷ OCCURRED_ON_DATE   
##   <chr>        <dbl> <chr>   <chr>   <chr>     <dbl> <lgl>   <dttm>             
## 1 I182080058    2403 Disord… DISTUR… E18         495 FALSE   2018-10-03 20:13:00
## 2 I182080053    3201 Proper… PROPER… D14         795 FALSE   2018-08-30 20:00:00
## 3 I182080052    2647 Other   THREAT… B2          329 FALSE   2018-10-03 19:20:00
## 4 I182080051     413 Aggrav… ASSAUL… A1           92 FALSE   2018-10-03 20:00:00
## 5 I182080050    3122 Aircra… AIRCRA… A7           36 FALSE   2018-10-03 20:49:00
## 6 I182080049    1402 Vandal… VANDAL… C11         351 FALSE   2018-10-02 20:40:00
## # … with 9 more variables: YEAR <dbl>, MONTH <dbl>, DAY_OF_WEEK <chr>,
## #   HOUR <dbl>, UCR_PART <chr>, STREET <chr>, Lat <dbl>, Long <dbl>,
## #   Location <chr>, and abbreviated variable names ¹​INCIDENT_NUMBER,
## #   ²​OFFENSE_CODE, ³​OFFENSE_CODE_GROUP, ⁴​OFFENSE_DESCRIPTION, ⁵​DISTRICT,
## #   ⁶​REPORTING_AREA, ⁷​SHOOTING
# List all column names (the tibble preview above abbreviates them).
names(crime)
##  [1] "INCIDENT_NUMBER"     "OFFENSE_CODE"        "OFFENSE_CODE_GROUP" 
##  [4] "OFFENSE_DESCRIPTION" "DISTRICT"            "REPORTING_AREA"     
##  [7] "SHOOTING"            "OCCURRED_ON_DATE"    "YEAR"               
## [10] "MONTH"               "DAY_OF_WEEK"         "HOUR"               
## [13] "UCR_PART"            "STREET"              "Lat"                
## [16] "Long"                "Location"
# Show each column's storage type and first values, plus the readr column
# specification attached to the tibble.
str(crime)
## spec_tbl_df [327,820 × 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ INCIDENT_NUMBER    : chr [1:327820] "I182080058" "I182080053" "I182080052" "I182080051" ...
##  $ OFFENSE_CODE       : num [1:327820] 2403 3201 2647 413 3122 ...
##  $ OFFENSE_CODE_GROUP : chr [1:327820] "Disorderly Conduct" "Property Lost" "Other" "Aggravated Assault" ...
##  $ OFFENSE_DESCRIPTION: chr [1:327820] "DISTURBING THE PEACE" "PROPERTY - LOST" "THREATS TO DO BODILY HARM" "ASSAULT - AGGRAVATED - BATTERY" ...
##  $ DISTRICT           : chr [1:327820] "E18" "D14" "B2" "A1" ...
##  $ REPORTING_AREA     : num [1:327820] 495 795 329 92 36 351 NA 603 543 621 ...
##  $ SHOOTING           : logi [1:327820] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ OCCURRED_ON_DATE   : POSIXct[1:327820], format: "2018-10-03 20:13:00" "2018-08-30 20:00:00" ...
##  $ YEAR               : num [1:327820] 2018 2018 2018 2018 2018 ...
##  $ MONTH              : num [1:327820] 10 8 10 10 10 10 10 10 10 10 ...
##  $ DAY_OF_WEEK        : chr [1:327820] "Wednesday" "Thursday" "Wednesday" "Wednesday" ...
##  $ HOUR               : num [1:327820] 20 20 19 20 20 20 20 19 19 20 ...
##  $ UCR_PART           : chr [1:327820] "Part Two" "Part Three" "Part Two" "Part One" ...
##  $ STREET             : chr [1:327820] "ARLINGTON ST" "ALLSTON ST" "DEVON ST" "CAMBRIDGE ST" ...
##  $ Lat                : num [1:327820] 42.3 42.4 42.3 42.4 42.4 ...
##  $ Long               : num [1:327820] -71.1 -71.1 -71.1 -71.1 -71 ...
##  $ Location           : chr [1:327820] "(42.26260773, -71.12118637)" "(42.35211146, -71.13531147)" "(42.30812619, -71.07692974)" "(42.35945371, -71.05964817)" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   INCIDENT_NUMBER = col_character(),
##   ..   OFFENSE_CODE = col_double(),
##   ..   OFFENSE_CODE_GROUP = col_character(),
##   ..   OFFENSE_DESCRIPTION = col_character(),
##   ..   DISTRICT = col_character(),
##   ..   REPORTING_AREA = col_double(),
##   ..   SHOOTING = col_logical(),
##   ..   OCCURRED_ON_DATE = col_datetime(format = ""),
##   ..   YEAR = col_double(),
##   ..   MONTH = col_double(),
##   ..   DAY_OF_WEEK = col_character(),
##   ..   HOUR = col_double(),
##   ..   UCR_PART = col_character(),
##   ..   STREET = col_character(),
##   ..   Lat = col_double(),
##   ..   Long = col_double(),
##   ..   Location = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Number and share of incidents per day of week. DAY_OF_WEEK is a character
# column, so rows come out in alphabetical order.
crime %>%
  count(DAY_OF_WEEK, name = "count") %>%
  mutate(prop = count / sum(count))
## # A tibble: 7 × 3
##   DAY_OF_WEEK count  prop
##   <chr>       <int> <dbl>
## 1 Friday      49758 0.152
## 2 Monday      46970 0.143
## 3 Saturday    45969 0.140
## 4 Sunday      41374 0.126
## 5 Thursday    47872 0.146
## 6 Tuesday     47726 0.146
## 7 Wednesday   48151 0.147
# Same table, ordered from the busiest day to the quietest.
crime %>%
  count(DAY_OF_WEEK, name = "count") %>%
  mutate(prop = count / sum(count)) %>%
  arrange(desc(count))
## # A tibble: 7 × 3
##   DAY_OF_WEEK count  prop
##   <chr>       <int> <dbl>
## 1 Friday      49758 0.152
## 2 Wednesday   48151 0.147
## 3 Thursday    47872 0.146
## 4 Tuesday     47726 0.146
## 5 Monday      46970 0.143
## 6 Saturday    45969 0.140
## 7 Sunday      41374 0.126
# Number and share of incidents per numeric month, in calendar order.
crime %>%
  count(MONTH, name = "count") %>%
  mutate(prop = count / sum(count)) %>%
  arrange(MONTH)
## # A tibble: 12 × 3
##    MONTH count   prop
##    <dbl> <int>  <dbl>
##  1     1 23625 0.0721
##  2     2 21661 0.0661
##  3     3 24156 0.0737
##  4     4 24108 0.0735
##  5     5 26242 0.0801
##  6     6 30622 0.0934
##  7     7 34640 0.106 
##  8     8 35137 0.107 
##  9     9 34023 0.104 
## 10    10 26437 0.0806
## 11    11 23685 0.0723
## 12    12 23484 0.0716
month.abb
##  [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"
# Look up the three-letter month abbreviation by indexing month.abb with the
# numeric month, then check the first five rows.
crime <- crime %>%
  mutate(Month = month.abb[MONTH])
crime %>%
  select(MONTH, Month) %>%
  slice_head(n = 5)
## # A tibble: 5 × 2
##   MONTH Month
##   <dbl> <chr>
## 1    10 Oct  
## 2     8 Aug  
## 3    10 Oct  
## 4    10 Oct  
## 5    10 Oct
# Month is still a character column here, so arrange() sorts it
# alphabetically rather than in calendar order.
crime %>%
  count(Month, name = "count") %>%
  mutate(prop = count / sum(count)) %>%
  arrange(Month)
## # A tibble: 12 × 3
##    Month count   prop
##    <chr> <int>  <dbl>
##  1 Apr   24108 0.0735
##  2 Aug   35137 0.107 
##  3 Dec   23484 0.0716
##  4 Feb   21661 0.0661
##  5 Jan   23625 0.0721
##  6 Jul   34640 0.106 
##  7 Jun   30622 0.0934
##  8 Mar   24156 0.0737
##  9 May   26242 0.0801
## 10 Nov   23685 0.0723
## 11 Oct   26437 0.0806
## 12 Sep   34023 0.104
# A character vector of course codes ...
courses <- c(
  "MATH 203", "MATH 204", "MATH 208", "MATH 324", "MATH 423",
  "MATH 447", "MATH 523", "MATH 525", "MATH 533", "MATH 545"
)
class(courses)
## [1] "character"
# ... and the same values stored as a factor: integer codes underneath
# (hence mode "numeric") plus `levels` and `class` attributes.
courses_fct <- factor(courses)
mode(courses_fct)
## [1] "numeric"
attributes(courses_fct)
## $levels
##  [1] "MATH 203" "MATH 204" "MATH 208" "MATH 324" "MATH 423" "MATH 447"
##  [7] "MATH 523" "MATH 525" "MATH 533" "MATH 545"
## 
## $class
## [1] "factor"
# Store the day of week as a factor with levels in Monday..Sunday order, so
# grouped summaries follow the week instead of the alphabet.
crime <- crime %>%
  mutate(
    Day_of_week = fct_relevel(
      DAY_OF_WEEK,
      "Monday", "Tuesday", "Wednesday", "Thursday",
      "Friday", "Saturday", "Sunday"
    )
  )
crime %>%
  count(Day_of_week, name = "count") %>%
  mutate(prop = count / sum(count))
## # A tibble: 7 × 3
##   Day_of_week count  prop
##   <fct>       <int> <dbl>
## 1 Monday      46970 0.143
## 2 Tuesday     47726 0.146
## 3 Wednesday   48151 0.147
## 4 Thursday    47872 0.146
## 5 Friday      49758 0.152
## 6 Saturday    45969 0.140
## 7 Sunday      41374 0.126
# Convert Month to a factor with levels in calendar order (month.abb), then
# tabulate incidents per month; arrange() now follows the level order.
crime <- crime %>%
  mutate(Month = fct_relevel(Month, month.abb))
crime_by_month <- crime %>%
  count(Month, name = "count") %>%
  mutate(prop = count / sum(count)) %>%
  arrange(Month)
crime_by_month
## # A tibble: 12 × 3
##    Month count   prop
##    <fct> <int>  <dbl>
##  1 Jan   23625 0.0721
##  2 Feb   21661 0.0661
##  3 Mar   24156 0.0737
##  4 Apr   24108 0.0735
##  5 May   26242 0.0801
##  6 Jun   30622 0.0934
##  7 Jul   34640 0.106 
##  8 Aug   35137 0.107 
##  9 Sep   34023 0.104 
## 10 Oct   26437 0.0806
## 11 Nov   23685 0.0723
## 12 Dec   23484 0.0716
# Pie charts are built as a single stacked bar wrapped into polar
# coordinates along the y axis.

# Slices sized by raw counts.
ggplot(crime_by_month, aes(x = "", y = count, fill = Month)) +
  geom_col() +
  coord_polar("y", start = 0)

# Slices sized by proportions.
ggplot(crime_by_month, aes(x = "", y = prop, fill = Month)) +
  geom_col() +
  coord_polar("y", start = 0)

# Counts again, with the discrete viridis palette.
ggplot(crime_by_month, aes(x = "", y = count, fill = Month)) +
  geom_col() +
  coord_polar("y", start = 0) +
  scale_fill_viridis_d()

# Bar chart of incident counts per month, taken from the row-level data:
# geom_bar() counts the rows in each Month itself.
ggplot(crime, aes(x = Month, fill = Month)) +
  geom_bar() +
  scale_fill_viridis_d() +
  labs(y = "Total number of crimes")

# Bar chart of the share of incidents per month. The bar height is each
# month's count divided by the total count, computed from the statistic
# after geom_bar() has counted the rows.
# after_stat() replaces the deprecated `..count..` notation, and the y label
# now says "Proportion" (it previously read "Total number of crimes").
ggplot(crime, aes(x = Month, fill = Month)) +
  geom_bar(aes(y = after_stat(count / sum(count)))) +
  scale_fill_viridis_d() +
  ylab("Proportion of crimes")

# Number and share of incidents per offense group, most frequent first.
crime %>%
  count(OFFENSE_CODE_GROUP, name = "count") %>%
  mutate(prop = count / sum(count)) %>%
  arrange(desc(prop))
## # A tibble: 67 × 3
##    OFFENSE_CODE_GROUP              count   prop
##    <chr>                           <int>  <dbl>
##  1 Motor Vehicle Accident Response 38134 0.116 
##  2 Larceny                         26670 0.0814
##  3 Medical Assistance              24226 0.0739
##  4 Investigate Person              19176 0.0585
##  5 Other                           18612 0.0568
##  6 Drug Violation                  17037 0.0520
##  7 Simple Assault                  16263 0.0496
##  8 Vandalism                       15810 0.0482
##  9 Verbal Disputes                 13478 0.0411
## 10 Towed                           11632 0.0355
## # … with 57 more rows
# Keep the per-group counts and proportions for plotting.
off_code_counts <- crime %>%
  count(OFFENSE_CODE_GROUP, name = "count") %>%
  mutate(prop = count / sum(count))

# One bar per offense group, with height taken from the precomputed
# proportion.
ggplot(
  off_code_counts,
  aes(x = OFFENSE_CODE_GROUP, fill = OFFENSE_CODE_GROUP)
) +
  geom_col(aes(y = prop)) +
  scale_fill_viridis_d() +
  ylab("Proportion of crime")

# Collapse the long tail of offense groups: keep the most frequent groups
# and lump all remaining ones together.
# NOTE(review): the data already has a group literally named "Other" (see
# the frequency table above), so the lumped rows appear to merge into that
# existing level. The count table below shows 12 levels in total; confirm.
crime <- crime %>%
  mutate(code_lmp=fct_lump(OFFENSE_CODE_GROUP, 12))
head(crime)
## # A tibble: 6 × 20
##   INCIDENT…¹ OFFEN…² OFFEN…³ OFFEN…⁴ DISTR…⁵ REPOR…⁶ SHOOT…⁷ OCCURRED_ON_DATE   
##   <chr>        <dbl> <chr>   <chr>   <chr>     <dbl> <lgl>   <dttm>             
## 1 I182080058    2403 Disord… DISTUR… E18         495 FALSE   2018-10-03 20:13:00
## 2 I182080053    3201 Proper… PROPER… D14         795 FALSE   2018-08-30 20:00:00
## 3 I182080052    2647 Other   THREAT… B2          329 FALSE   2018-10-03 19:20:00
## 4 I182080051     413 Aggrav… ASSAUL… A1           92 FALSE   2018-10-03 20:00:00
## 5 I182080050    3122 Aircra… AIRCRA… A7           36 FALSE   2018-10-03 20:49:00
## 6 I182080049    1402 Vandal… VANDAL… C11         351 FALSE   2018-10-02 20:40:00
## # … with 12 more variables: YEAR <dbl>, MONTH <dbl>, DAY_OF_WEEK <chr>,
## #   HOUR <dbl>, UCR_PART <chr>, STREET <chr>, Lat <dbl>, Long <dbl>,
## #   Location <chr>, Month <fct>, Day_of_week <fct>, code_lmp <fct>, and
## #   abbreviated variable names ¹​INCIDENT_NUMBER, ²​OFFENSE_CODE,
## #   ³​OFFENSE_CODE_GROUP, ⁴​OFFENSE_DESCRIPTION, ⁵​DISTRICT, ⁶​REPORTING_AREA,
## #   ⁷​SHOOTING
# Counts and proportions per lumped offense category, smallest first.
off_code_counts_lmp <- crime %>%
  count(code_lmp) %>%
  mutate(prop = n / sum(n)) %>%
  arrange(n)
off_code_counts_lmp
## # A tibble: 12 × 3
##    code_lmp                             n   prop
##    <fct>                            <int>  <dbl>
##  1 Larceny From Motor Vehicle       11120 0.0339
##  2 Investigate Property             11443 0.0349
##  3 Towed                            11632 0.0355
##  4 Verbal Disputes                  13478 0.0411
##  5 Vandalism                        15810 0.0482
##  6 Simple Assault                   16263 0.0496
##  7 Drug Violation                   17037 0.0520
##  8 Investigate Person               19176 0.0585
##  9 Medical Assistance               24226 0.0739
## 10 Larceny                          26670 0.0814
## 11 Motor Vehicle Accident Response  38134 0.116 
## 12 Other                           122831 0.375
# Bar chart of the lumped categories. Bars follow the factor's level order,
# not the row order of the table.
ggplot(off_code_counts_lmp, aes(x = code_lmp, fill = code_lmp)) +
  geom_col(aes(y = prop)) +
  scale_fill_viridis_d() +
  ylab("Proportion of crimes")

# Reorder the factor levels by descending count so the bars run from the
# most to the least frequent category.
off_code_counts_lmp <- off_code_counts_lmp %>%
  mutate(code_lmp = fct_reorder(code_lmp, n, .desc = TRUE))

ggplot(off_code_counts_lmp, aes(x = code_lmp, fill = code_lmp)) +
  geom_col(aes(y = prop)) +
  scale_fill_viridis_d() +
  ylab("Proportion of crimes")

# Treemap of the lumped categories: tile area is mapped to the count `n`
# and fill colour to the category.
library(treemapify)
ggplot(off_code_counts_lmp, aes(area=n, fill=code_lmp)) + geom_treemap() + scale_fill_viridis_d()

# Reorder the full set of offense groups by descending count, then build a
# treemap of all of them. The plot is stored in `p` rather than printed, so
# it can be inspected and modified below.
off_code_counts <- off_code_counts %>% mutate(OFFENSE_CODE_GROUP = fct_reorder(OFFENSE_CODE_GROUP,count, .desc=TRUE))
p <- ggplot(off_code_counts, aes(area=count, fill=OFFENSE_CODE_GROUP)) + geom_treemap()
class(p)
## [1] "gg"     "ggplot"
attributes(p)
## $names
## [1] "data"        "layers"      "scales"      "mapping"     "theme"      
## [6] "coordinates" "facet"       "plot_env"    "labels"     
## 
## $class
## [1] "gg"     "ggplot"
# Draw the stored plot as is ...
print(p)

# ... and again with the legend suppressed.
p + theme(legend.position = "none")

# Pull the legend out of `p` (with legend text set to size 8) and draw it as
# a plot of its own, using ggpubr's get_legend() and as_ggplot().
library(ggpubr)
as_ggplot(get_legend(p+theme(legend.text=element_text(size=8))))

# Order the lumped categories by frequency (most common first), so the
# stacking and legend order follow frequency.
crime <- crime %>%
  mutate(code_lmp = fct_infreq(code_lmp))

# Stacked bars: incidents per month, split by lumped category.
ggplot(crime, aes(x = Month, fill = code_lmp)) +
  geom_bar() +
  scale_fill_viridis_d()

# One panel per month with a bar per lumped category. The x-axis title,
# labels and ticks are removed because the fill legend already identifies
# the categories.
ggplot(crime, aes(x = code_lmp, fill = code_lmp)) +
  geom_bar(position = "dodge") +
  facet_wrap(~Month) +
  scale_fill_viridis_d() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Mosaic plot of the joint distribution of code_lmp and Day_of_week, with
# tiles coloured by Day_of_week. The x-axis title, labels and ticks are
# suppressed.
library(ggmosaic)
ggplot(crime) + geom_mosaic(aes(x=product(code_lmp, Day_of_week),  fill=Day_of_week)) +
    theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank())
## Warning: `unite_()` was deprecated in tidyr 1.2.0.
## Please use `unite()` instead.

# Same mosaic plot with the two variables swapped inside product() and the
# tiles coloured by code_lmp instead.
ggplot(crime) + geom_mosaic(aes(x=product(Day_of_week, code_lmp),  fill=code_lmp)) +
    theme(axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

# Look at the first 20 incident timestamps.
crime %>%
  select(OCCURRED_ON_DATE) %>%
  head(20)
## # A tibble: 20 × 1
##    OCCURRED_ON_DATE   
##    <dttm>             
##  1 2018-10-03 20:13:00
##  2 2018-08-30 20:00:00
##  3 2018-10-03 19:20:00
##  4 2018-10-03 20:00:00
##  5 2018-10-03 20:49:00
##  6 2018-10-02 20:40:00
##  7 2018-10-03 20:16:00
##  8 2018-10-03 19:32:00
##  9 2018-10-03 19:27:00
## 10 2018-10-03 20:00:00
## 11 2018-10-03 19:33:00
## 12 2018-10-01 20:00:00
## 13 2018-10-03 17:18:00
## 14 2018-10-03 08:00:00
## 15 2018-10-03 19:58:00
## 16 2018-10-03 19:30:00
## 17 2018-10-03 18:35:00
## 18 2018-10-03 19:56:00
## 19 2018-10-03 18:41:00
## 20 2018-10-03 18:18:00
# Earliest, median and latest incident timestamp.
crime %>%
  summarise(
    min = min(OCCURRED_ON_DATE),
    med = median(OCCURRED_ON_DATE),
    max = max(OCCURRED_ON_DATE)
  )
## # A tibble: 1 × 3
##   min                 med                 max                
##   <dttm>              <dttm>              <dttm>             
## 1 2015-06-15 00:00:00 2017-02-14 15:49:00 2018-10-03 20:49:00
# The timestamp column is stored as POSIXct.
crime %>%
  pull(OCCURRED_ON_DATE) %>%
  class()
## [1] "POSIXct" "POSIXt"
# Underneath, POSIXct values are plain numbers (seconds since the epoch).
crime %>%
  pull(OCCURRED_ON_DATE) %>%
  head(20) %>%
  as.numeric()
##  [1] 1538597580 1535659200 1538594400 1538596800 1538599740 1538512800
##  [7] 1538597760 1538595120 1538594820 1538596800 1538595180 1538424000
## [13] 1538587080 1538553600 1538596680 1538595000 1538591700 1538596560
## [19] 1538592060 1538590680
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# A date typed as text is just a character string ...
my_date <- "2003-05-29"
class(my_date)
## [1] "character"
# ... until it is parsed; ymd() reads year-month-day text into a Date.
my_date <- ymd(my_date)
class(my_date)
## [1] "Date"
# ymd_hms() parses a full timestamp; `tz` sets the time zone it is
# interpreted in.
other_date = ymd_hms("2009-05-02 02:57:00", tz="America/Montreal")
other_date
## [1] "2009-05-02 02:57:00 EDT"
# Date arithmetic with periods is vectorised: shift by -2..2 days and by
# -1..1 months.
my_date + days(-2:2)
## [1] "2003-05-27" "2003-05-28" "2003-05-29" "2003-05-30" "2003-05-31"
my_date + months(-1:1)
## [1] "2003-04-29" "2003-05-29" "2003-06-29"
# Current timestamp; the value depends on when the script is run.
now()
## [1] "2022-10-06 17:47:11 EDT"
# Subtracting two Dates gives a time difference in days.
today() - my_date
## Time difference of 7070 days
# Length of the interval from other_date to now, expressed in years.
interval(other_date, now()) / years(1)
## [1] 13.43183
# Incidents per calendar day: drop the time-of-day part of the timestamp and
# count rows per resulting date.
by_date_tbl <- crime %>%
  count(date_only = date(OCCURRED_ON_DATE), name = "count")

# The five busiest days.
by_date_tbl %>%
  arrange(desc(count)) %>%
  head(5)
## # A tibble: 5 × 2
##   date_only  count
##   <date>     <int>
## 1 2016-09-01   379
## 2 2017-09-01   377
## 3 2018-06-15   376
## 4 2017-09-22   369
## 5 2017-08-04   361
# Daily incident counts over time, with a smoothed trend on top. No method
# is given to geom_smooth(), so ggplot2 picks one (see the message below).
ggplot(by_date_tbl, aes(x=date_only, y=count)) + geom_line() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Incidents per (YEAR, Month). With two grouping variables summarise() peels
# off only the last one, so the result is still grouped by YEAR (see the
# message below). It is ungrouped later, before Month_Year is built.
by_month_tbl  = crime %>% group_by(YEAR,Month) %>%
  summarise(count=n())
## `summarise()` has grouped output by 'YEAR'. You can override using the
## `.groups` argument.
# The five busiest year-month combinations.
by_month_tbl %>% arrange(desc(count)) %>% head(5)
## # A tibble: 5 × 3
## # Groups:   YEAR [2]
##    YEAR Month count
##   <dbl> <fct> <int>
## 1  2017 Aug    9209
## 2  2017 Jul    9077
## 3  2017 Jun    8990
## 4  2017 Sep    8950
## 5  2016 Aug    8940
# Build a single "Month.YEAR" factor to use as a discrete time axis.
# factor() is wrapped around interaction() to drop combinations that never
# occur in the data.
by_month_tbl <- by_month_tbl %>%
  ungroup() %>%
  mutate(Month_Year = factor(interaction(Month, YEAR)))

# The January levels and their integer positions among the factor levels.
Jan_levels <- by_month_tbl %>%
  filter(Month == "Jan") %>%
  pull(Month_Year) %>%
  unique()
tibble(Jan_levels, as.numeric(Jan_levels))
## # A tibble: 3 × 2
##   Jan_levels `as.numeric(Jan_levels)`
##   <fct>                         <dbl>
## 1 Jan.2016                          8
## 2 Jan.2017                         20
## 3 Jan.2018                         32
# Monthly incident counts on the discrete Month_Year axis.
# group=1 makes geom_line() connect all points as one series, although x is
# a factor.
ggplot(by_month_tbl,aes(x=Month_Year,y=count, group=1)) +
  geom_point() +   geom_line() +
  # Rotate the tick labels so the many Month.YEAR labels do not overlap.
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x="Date",y="Count",title="Number of crimes by month") +
  # Dashed red lines at each January: as.numeric() on the factor values gives
  # their integer positions on the discrete axis.
  geom_vline(xintercept=as.numeric(Jan_levels),
             col="red",linetype="dashed")

# Round every timestamp down to the first instant of its month, which gives
# a real date-time axis for monthly aggregation.
crime <- crime %>%
  mutate(First_of_month = floor_date(OCCURRED_ON_DATE, "month"))

# Spot-check ten randomly chosen rows; the sample differs on every run.
# slice_sample() replaces slice(sample(x = 1:n(), size = 10)) and avoids the
# 1:n() sequence, which misbehaves on an empty table.
crime %>%
  slice_sample(n = 10) %>%
  select(OCCURRED_ON_DATE, First_of_month)
## # A tibble: 10 × 2
##    OCCURRED_ON_DATE    First_of_month     
##    <dttm>              <dttm>             
##  1 2017-03-22 16:37:00 2017-03-01 00:00:00
##  2 2015-12-28 10:18:00 2015-12-01 00:00:00
##  3 2017-02-20 19:11:00 2017-02-01 00:00:00
##  4 2017-01-18 15:36:00 2017-01-01 00:00:00
##  5 2016-07-27 20:30:00 2016-07-01 00:00:00
##  6 2016-06-03 22:42:00 2016-06-01 00:00:00
##  7 2016-04-29 11:23:00 2016-04-01 00:00:00
##  8 2017-08-03 10:04:00 2017-08-01 00:00:00
##  9 2017-08-04 19:33:00 2017-08-01 00:00:00
## 10 2017-09-24 18:00:00 2017-09-01 00:00:00
# Incidents per month, keyed by the first-of-month timestamp.
by_month_tbl2 <- crime %>%
  group_by(First_of_month) %>%
  summarise(count = n())

# Monthly counts on a continuous date-time axis, with dashed red lines at
# the start of each year.
# The markers are built with tz = "UTC" to match First_of_month: readr
# parses date-times as UTC by default, whereas as.POSIXct() without `tz`
# uses the session's local zone and would shift each marker by the local
# UTC offset.
ggplot(by_month_tbl2, aes(x = First_of_month, y = count)) +
  geom_point() +
  geom_line() +
  labs(x = "Date", y = "Count", title = "Number of crimes by month") +
  geom_vline(
    xintercept = as.POSIXct(
      c("2016-01-01", "2017-01-01", "2018-01-01"),
      tz = "UTC"
    ),
    col = "red", linetype = "dashed"
  )

# Special numeric values: dividing a non-zero number by zero gives Inf ...
1/0
## [1] Inf
# ... which behaves as a limit in further arithmetic.
exp(-Inf)
## [1] 0
# Undefined results are NaN ("not a number").
0/0
## [1] NaN
sqrt(-1)
## Warning in sqrt(-1): NaNs produced
## [1] NaN
# The same square root is defined once the input is complex.
sqrt(as.complex(-1))
## [1] 0+1i
# Element-wise sums: NA and NaN propagate, and -Inf + Inf is NaN.
c(1,2,3,-Inf) + c(NA, Inf, NaN, Inf)
## [1]  NA Inf NaN NaN
# Text that cannot be read as a number is coerced to NA, with a warning.
as.numeric("My Missing Value")
## Warning: NAs introduced by coercion
## [1] NA
# A factor converts to its integer level code rather than to NA.
as.numeric(factor("My Missing Value"))
## [1] 1
# Mixing a number and a string in c() coerces everything to character ...
c(1, "3")
## [1] "1" "3"
# ... which converts back to numeric when every element looks like a number.
as.numeric(c(1, "3"))
## [1] 1 3
# Count the missing values in every column, then reshape to one row per
# variable.
crime %>%
  summarise_all(list(~ sum(is.na(.)))) %>%
  pivot_longer(cols = everything(), names_to = "Variable")
## # A tibble: 21 × 2
##    Variable            value
##    <chr>               <int>
##  1 INCIDENT_NUMBER         0
##  2 OFFENSE_CODE            0
##  3 OFFENSE_CODE_GROUP      0
##  4 OFFENSE_DESCRIPTION     0
##  5 DISTRICT             1774
##  6 REPORTING_AREA      20920
##  7 SHOOTING                0
##  8 OCCURRED_ON_DATE        0
##  9 YEAR                    0
## 10 MONTH                   0
## # … with 11 more rows
# Same table, restricted to the columns that actually contain missing
# values.
crime %>%
  summarise_all(list(~ sum(is.na(.)))) %>%
  pivot_longer(cols = everything(), names_to = "Variable") %>%
  filter(value > 0)
## # A tibble: 6 × 2
##   Variable       value
##   <chr>          <int>
## 1 DISTRICT        1774
## 2 REPORTING_AREA 20920
## 3 UCR_PART          93
## 4 STREET         10977
## 5 Lat            20632
## 6 Long           20632
# drop_na() with no columns named removes every row that has a missing
# value in any column. The row count before dropping is shown for
# comparison.
crime_no_na <- crime %>% drop_na()
crime %>% summarise(n())
## # A tibble: 1 × 1
##    `n()`
##    <int>
## 1 327820
# Row count after dropping all incomplete rows.
crime_no_na %>% summarise(n())
## # A tibble: 1 × 1
##    `n()`
##    <int>
## 1 304167
# Naming a column limits the check to it: only rows with a missing UCR_PART
# are removed.
crime %>% drop_na(UCR_PART) %>% summarise(n())
## # A tibble: 1 × 1
##    `n()`
##    <int>
## 1 327727
# Distinct lumped categories. sort() on a factor follows the level order,
# not alphabetical order.
crime %>% pull(code_lmp) %>% unique(.) %>% sort(.)
##  [1] Other                           Motor Vehicle Accident Response
##  [3] Larceny                         Medical Assistance             
##  [5] Investigate Person              Drug Violation                 
##  [7] Simple Assault                  Vandalism                      
##  [9] Verbal Disputes                 Towed                          
## [11] Investigate Property            Larceny From Motor Vehicle     
## 12 Levels: Other Motor Vehicle Accident Response Larceny ... Larceny From Motor Vehicle
# Merge related categories: the two "Investigate ..." groups become
# "Investigate" and the two motor-vehicle groups become "Motor Vehicle".
# Levels not named here are left unchanged.
crime <- crime %>%
  mutate(
    code_lmp_alt = recode(
      code_lmp,
      "Investigate Person" = "Investigate",
      "Investigate Property" = "Investigate",
      "Motor Vehicle Accident Response" = "Motor Vehicle",
      "Larceny From Motor Vehicle" = "Motor Vehicle"
    )
  )

# Counts per merged category, largest first.
crime %>%
  count(code_lmp_alt, name = "count") %>%
  arrange(desc(count))
## # A tibble: 10 × 2
##    code_lmp_alt        count
##    <fct>               <int>
##  1 Other              122831
##  2 Motor Vehicle       49254
##  3 Investigate         30619
##  4 Larceny             26670
##  5 Medical Assistance  24226
##  6 Drug Violation      17037
##  7 Simple Assault      16263
##  8 Vandalism           15810
##  9 Verbal Disputes     13478
## 10 Towed               11632
# Most frequent Location strings. Placeholder coordinates such as (0, 0)
# and (-1, -1) show up among the top values.
crime %>%
  count(Location, name = "count") %>%
  arrange(desc(count))
## # A tibble: 18,255 × 2
##    Location                    count
##    <chr>                       <int>
##  1 (0.00000000, 0.00000000)    20632
##  2 (42.34862382, -71.08277637)  1276
##  3 (42.36183857, -71.05976489)  1248
##  4 (42.28482577, -71.09137369)  1137
##  5 (42.32866284, -71.08563401)  1075
##  6 (42.25621592, -71.12401947)   916
##  7 (42.29755533, -71.05970910)   794
##  8 (42.34128751, -71.05467933)   786
##  9 (-1.00000000, -1.00000000)    775
## 10 (42.33152148, -71.07085307)   760
## # … with 18,245 more rows
# Treat the two placeholder coordinate strings as missing: recoding a level
# to NULL removes it, so those rows become NA in the new factor.
crime <- crime %>%
  mutate(
    Location_alt = fct_recode(
      Location,
      NULL = "(-1.00000000, -1.00000000)",
      NULL = "(0.00000000, 0.00000000)"
    )
  )

# Counts per cleaned location; the NA row now holds both placeholders.
crime %>%
  count(Location_alt, name = "count") %>%
  arrange(desc(count))
## # A tibble: 18,254 × 2
##    Location_alt                count
##    <fct>                       <int>
##  1 <NA>                        21407
##  2 (42.34862382, -71.08277637)  1276
##  3 (42.36183857, -71.05976489)  1248
##  4 (42.28482577, -71.09137369)  1137
##  5 (42.32866284, -71.08563401)  1075
##  6 (42.25621592, -71.12401947)   916
##  7 (42.29755533, -71.05970910)   794
##  8 (42.34128751, -71.05467933)   786
##  9 (42.33152148, -71.07085307)   760
## 10 (42.35231190, -71.06370510)   707
## # … with 18,244 more rows